Question: Does a University being Private or Public have an effect on its graduation rate and what factors specifically affect those rates?
Hypothesis: Private universities might have a higher graduation rate than public universities and that the biggest factors for the graduation rate will be: the number of students enrolled, the amount of faculty with PhD’s and economic factors of the university (Variables: Room.Board, Books, Personal and Expend).
Download Location: https://www.kaggle.com/yashgpt/us-college-data
import numpy as np
import pandas as pd
import seaborn as sb
import statsmodels.formula.api as smf
from scipy.stats import norm
from ThinkStats2 import thinkplot
from ThinkStats2 import thinkstats2
from matplotlib import pyplot as plt
from ThinkStats2.thinkstats2 import Mean, MeanVar, Var, Std, Cov
class DiffMeansPermute(thinkstats2.HypothesisTest):
def TestStatistic(self, data):
group1, group2 = data
test_stat = abs(group1.mean() - group2.mean())
return test_stat
def MakeModel(self):
group1, group2 = self.data
self.n, self.m = len(group1), len(group2)
self.pool = np.hstack((group1, group2))
def RunModel(self):
np.random.shuffle(self.pool)
data = self.pool[:self.n], self.pool[self.n:]
return data
class DiffStdPermute(DiffMeansPermute):
def TestStatistic(self, data):
group1, group2 = data
test_stat = group1.std() - group2.std()
return test_stat
class ChiSquares(thinkstats2.HypothesisTest):
def MakeModel(self):
firsts, others = self.data
self.n = len(firsts)
self.pool = np.hstack((firsts, others))
pmf = thinkstats2.Pmf(self.pool)
self.values = range(35, 44)
self.expected_probs = np.array(pmf.Probs(self.values))
def RunModel(self):
np.random.shuffle(self.pool)
data = self.pool[:self.n], self.pool[self.n:]
return data
def TestStatistic(self, data):
firsts, others = data
stat = self.ChiSquared(firsts) + self.ChiSquared(others)
return stat
def ChiSquared(self, lengths):
hist = thinkstats2.Hist(lengths)
observed = np.array(hist.Freqs(self.values))
expected = self.expected_probs * len(lengths)
stat = sum((observed - expected) ** 2 / expected)
return stat
class SlopeTest(thinkstats2.HypothesisTest):
def TestStatistic(self, data):
ages, weights = data
_, slope = thinkstats2.LeastSquares(ages, weights)
return slope
def MakeModel(self):
_, weights = self.data
self.ybar = weights.mean()
self.res = weights - self.ybar
def RunModel(self):
ages, _ = self.data
weights = self.ybar + np.random.permutation(self.res)
return ages, weights
def FalseNegRate(data, num_runs=1000):
group1, group2 = data
count = 0
for i in range(num_runs):
sample1 = thinkstats2.Resample(group1)
sample2 = thinkstats2.Resample(group2)
ht = DiffMeansPermute((sample1, sample2))
p_value = ht.PValue(iters=101)
if p_value > 0.05:
count += 1
return count / num_runs
def createHist(data, varList: list, histLabels: list):
fig = plt.figure()
fig.subplots_adjust(hspace=0.8, wspace=0.5)
fig.set_size_inches(13.5, 10)
i = 1
for var in varList:
fig.add_subplot(3, 3, i)
sb.distplot(pd.Series(data[var], name=histLabels[i-1]), fit=norm, kde=False).set_title(var + " Histogram")
i += 1
fig.tight_layout()
plt.show()
def getPubPrivCount(data):
yesCounter = 0
noCounter = 0
for i in range(len(data["Private"])):
if data["Private"][i] == "Yes":
yesCounter += 1
else:
noCounter += 1
return yesCounter, noCounter
def getSummaryStats(data, varList: list, tails: list):
i = 0
for col in varList:
print('Variable: ' + col + '\n===========================\n Tail: ' + tails[i] +
'\n Mean: {:>16,.5f}\n Variance\n (Spread): {:>16,.5f}'
'\nStandard\nDeviation: {:>16,.5f}\n Mode(s): '
.format(data[col].mean(), data[col].var(), data[col].std()))
print('' + str(data[col].mode()) + '\n')
i += 1
def PearsonCorr(xs, ys):
xs = np.asarray(xs)
ys = np.asarray(ys)
meanx, varx = thinkstats2.MeanVar(xs)
meany, vary = thinkstats2.MeanVar(ys)
corr = thinkstats2.Cov(xs, ys, meanx, meany) / np.sqrt(varx * vary)
return corr
def SpearmanCorr(xs, ys):
xranks = pd.Series(xs).rank()
yranks = pd.Series(ys).rank()
return PearsonCorr(xranks, yranks)
def LeastSquares(xs, ys):
meanx, varx = MeanVar(xs)
meany = Mean(ys)
slope = Cov(xs, ys, meanx, meany) / varx
inter = meany - slope * meanx
return inter, slope
def FitLine(xs, inter, slope):
fit_xs = np.sort(xs)
fit_ys = inter + slope * fit_xs
return fit_xs, fit_ys
def Residuals(xs, ys, inter, slope):
xs = np.asarray(xs)
ys = np.asarray(ys)
res = ys - (inter + slope * xs)
return res
def PlotPercentiles(means, cdfs):
thinkplot.PrePlot(3)
for percent in [75, 50, 25]:
weight_percentiles = [cdf.Percentile(percent) for cdf in cdfs]
label = '%dth' % percent
thinkplot.Plot(means, weight_percentiles, label=label)
def PlotConfidenceIntervals(xs, inters, slopes, percent=90, **options):
fys_seq = []
for inter, slope in zip(inters, slopes):
fxs, fys = FitLine(xs, inter, slope)
fys_seq.append(fys)
p = (100 - percent) / 2
percents = p, 100 - p
low, high = thinkstats2.PercentileRows(fys_seq, percents)
thinkplot.FillBetween(fxs, low, high, **options)
def SampleRows(df, nrows, replace=False):
indices = np.random.choice(df.index, nrows, replace=replace)
sample = df.loc[indices]
return sample
def ResampleRows(df):
return SampleRows(df, len(df), replace=True)
def SamplingDistributions(data, iters=101):
data = data.rename(columns = {'Grad.Rate': 'Grad_Rate'})
t = []
for _ in range(iters):
sample = ResampleRows(data)
gradRate = sample.Grad_Rate
enroll = sample.Enroll
estimates = LeastSquares(gradRate, enroll)
t.append(estimates)
inters, slopes = zip(*t)
return inters, slopes
def CoefDetermination(ys, res):
return 1 - Var(res) / Var(ys)
collegeData = pd.read_csv('College_Data.csv')
print("This dataset has: {:,} columns and {:,} rows".format(collegeData.shape[1], collegeData.shape[0]))
collegeData.head()
priv, pub = getPubPrivCount(collegeData)
print("Since the \"Private\" variable is a categorical variable (not a number), I counted the number of "
"universities in this dataset that are Private and Public:\n\nPrivate: {}\n Public: {}\n============\n"
" Total: {}"
.format(priv, pub, priv + pub))
varList = ["Apps", "Accept", "Enroll", "Room.Board", "Books", "Personal", "PhD", "Expend", "Grad.Rate"]
histLabels = ["Number of Applications", "Number of Accepted Students", "Number of Student Enrollments",
"Room and Board Cost ($)", "Book Cost ($)", "Estimated Personal Costs ($)",
"Percent of Professors with PhD's (%)", "Instructional Expenditure Per Student ($)",
"Graduation Rate (%)"]
createHist(collegeData, varList, histLabels)
This finds the top 10 smallest and largest values for each of the variables in this dataset to help identify any outliers that might need some attention.
for col in varList:
print('Top 10 Smallest values from: ' + col + '\n' + str(collegeData[col].nsmallest(10)) +
'\n\nTop 10 Largest values from: ' + col + '\n' + str(collegeData[col].nlargest(10)) + '\n\n')
I will save you the trouble of looking through the output but there are two variables that have outliers that need some attention: PhD and Grad.Rate
print('University Name: ' + str(collegeData['Uni.Name'][582]) +
'\nPercentage of Faculty with PhD\'s: ' + str(collegeData['PhD'][582]) +
'%\n\nUniversity Name: ' + str(collegeData['Uni.Name'][95]) +
'\nGraduation Rate: ' + str(collegeData['Grad.Rate'][95]) + '%')
Texas A&M University at Galveston lists a faculty PhD percentage of 103% which, in the context of the variable measured, is impossible because the variable is measuring the number of faculty with at least one PhD and thus if a person has multiple PhD’s they will only be counted as one.
Cazenovia College lists a graduation rate of 118% which is also impossible because you can't have more students graduated than the number of students in the graduating class.
For the above reasons, these two data points will be removed from the dataset as they are more than likely a data collection error.
# 582 is the index of Texas A&M University at Galveston in the dataset
# 95 is the index of Cazenovia College in the dataset
collegeData = collegeData.drop(index=[95, 582])
# Taken from the variable histograms above
tails = ['Right', 'Right', 'Right', 'None', 'None', 'Right', 'Left', 'Right', 'None']
getSummaryStats(collegeData, varList, tails)
For the purposes of this analysis, i am comparing the graduation rates of Public and Private universities to see which one is higher and to see which variables have the greatest significance in determining that grad rate. As a result, I will be splitting the dataset into two parts: Public and Private universities.
# Split dataset into two parts: Public and Private universities
publicUni = collegeData[collegeData['Private'] == 'No']
privateUni = collegeData[collegeData['Private'] == 'Yes']
I decided to take a look and see if public or private universities have a higher percentage of faculty with PhD's.
# Create the PMF's of PhD's
publicPhD = thinkstats2.Pmf(publicUni['PhD'], label='Public')
privatePhD = thinkstats2.Pmf(privateUni['PhD'], label='Private')
# Plot the PMF's
fig = plt.figure()
plt.rcParams['axes.titlesize'] = 16
fig.set_size_inches(16.5, 10)
thinkplot.set_font_size(5)
thinkplot.Pmfs([publicPhD, privatePhD])
thinkplot.Config(title='PMF of Private and Public Graduation Rates', xlabel='Percent of Faculty with PhD\'s')
plt.legend(prop={"size":15})
Judging from the graph above, it would appear that public universities have a higher percentage of faculty with PhD's compared to private universities.
public_cdf = thinkstats2.Cdf(publicUni['Grad.Rate'], label='Public')
private_cdf = thinkstats2.Cdf(privateUni['Grad.Rate'], label='Private')
fig = plt.figure()
fig.set_size_inches(16.5, 10)
plt.rcParams['axes.titlesize'] = 16
thinkplot.set_font_size(5)
thinkplot.Cdfs([public_cdf, private_cdf])
thinkplot.Config(title='CDF of Private and Public Graduation Rates', xlabel='Graduation Rate (%)', ylabel='CDF')
plt.legend(prop={"size":15})
From the above graph we can see that public universities consistently have a higher graduation rate than private ones. This tells me that my initial hypothesis that private universities would have a higher graduation rate is not correct.
I will need to investigate more to find which variables are significantly contributing the difference in grad rate between public and private universities.
fig = plt.figure()
fig.set_size_inches(16, 10)
fig.subplots_adjust(hspace=0.8, wspace=0.5)
plt.rcParams['axes.titlesize'] = 16
labels = ['All', 'Public', 'Private']
xs = [-4, 4]
for i in range(1, 4):
fig.add_subplot(2, 2, i)
thinkplot.set_font_size(5)
if i == 1:
mean, var = thinkstats2.TrimmedMeanVar(collegeData[varList[8]], p=0.01)
std = np.sqrt(var)
elif i == 2:
mean, var = thinkstats2.TrimmedMeanVar(publicUni[varList[8]], p=0.01)
std = np.sqrt(var)
elif i == 3:
mean, var = thinkstats2.TrimmedMeanVar(privateUni[varList[8]], p=0.01)
std = np.sqrt(var)
fxs, fys = thinkstats2.FitLine(xs, mean, std)
thinkplot.Plot(fxs, fys, linewidth=4, color='0.8')
if i == 1:
xs, ys = thinkstats2.NormalProbability(collegeData[varList[8]])
thinkplot.Plot(xs, ys, label= labels[i-1] + ' Universities')
elif i == 2:
xs, ys = thinkstats2.NormalProbability(publicUni[varList[8]])
thinkplot.Plot(xs, ys, label=labels[i-1])
elif i == 3:
xs, ys = thinkstats2.NormalProbability(privateUni[varList[8]])
thinkplot.Plot(xs, ys, label=labels[i-1])
thinkplot.Config(title='Normal Probability Plot of ' + labels[i-1] + ' Graduation Rates',
xlabel='Standard Deviations from Mean',
ylabel='Graduation Rate (%)')
fig.tight_layout()
From the Normal Probability Plots above, we can see that the distribution of All university graduation rates is approximately normal with a slight tail on the left side.
Public and Private graduation rate plots makes it clear that their distributions also follow normality but with more pronounced tails on both ends.
The graphs also show that the distribution of all the graduation rates (public and private) is approximately normal and that there isn't significant skewness to the data that could disproportionally sway the analysis.
fig = plt.figure()
fig.set_size_inches(16, 10)
fig.subplots_adjust(hspace=0.8, wspace=0.25)
plt.rcParams['axes.titlesize'] = 16
labels = ['Public', 'Private']
for i in range(1, 3):
fig.add_subplot(1, 2, i)
thinkplot.set_font_size(5)
if i == 1:
thinkplot.Scatter(publicUni['Accept'], publicUni['Enroll'], alpha=.60, s=10)
plt.plot(np.unique(publicUni['Accept']),
np.poly1d(np.polyfit(publicUni['Accept'], publicUni['Enroll'], 1))(np.unique(publicUni['Accept'])))
elif i == 2:
thinkplot.Scatter(privateUni['Accept'], privateUni['Enroll'], alpha=.25, s=10)
plt.plot(np.unique(privateUni['Accept']),
np.poly1d(np.polyfit(privateUni['Accept'], privateUni['Enroll'], 1))(np.unique(privateUni['Accept'])))
thinkplot.Config(title=labels[i-1] + ' Acceptence vs. New Enrollments',
xlabel='Students Accepted',
ylabel='New Students Enrolled',
legend=False)
print('|==============================|\n|\tAccept vs. Enroll |'
'\n|==========|=========|=========|\n|\t | Public | Private |\n'
'|----------|---------|---------|\n'
'| Pearson | {:.5f} | {:.5f} |\n|----------|---------|---------|\n'
'| Spearman | {:.5f} | {:.5f} |\n|==========|=========|=========|'
.format(PearsonCorr(publicUni['Accept'], publicUni['Enroll']),
PearsonCorr(privateUni['Accept'], privateUni['Enroll']),
SpearmanCorr(publicUni['Accept'], publicUni['Enroll']),
SpearmanCorr(privateUni['Accept'], privateUni['Enroll'])))
From the two scatter plots and correlation matrix above (detailing both Pearson and Spearman Corrs), we can see that the number of accepted and enrolled students in both public (Pearson: 0.879) and private (Pearson: 0.906) universities is positively strongly correlated and even more so when using Spearman's correlation (Public: 0.932, Private: 0.928) when compared to Pearson's.
This indicates that the number of students accepted and enrolled is linearly related and that as the number of students accepted into the university increases, then the number of new student enrollments will also increase a proportional amount.
chiSquare = ChiSquares(gradRates)
chi_Actual = chiSquare.actual
print('Hypothesis Tests - Public and Private Grad Rate\n'
'\nDifference in Means:\n'
'\tP-value: {}'
'\n\nDifference in Standard Deviations:\n'
'\tP-value: {}\n'
'\nChi-Squares:\n'
'\tP-value: {}\n'
'\t Actual: {:.5f}\n'
'\t Ts Max: {:.5f}\n'
'\nFalse Negative Rate: {}'.format(DiffMeansPermute(gradRates).PValue(),
DiffStdPermute(gradRates).PValue(),
chiSquare.PValue(), chi_Actual,
chiSquare.MaxTestStat(),
FalseNegRate(gradRates)))
The first hypothesis test conducted is the Difference in Means, which resulted in a p-value of zero (odds are its not exactly zero but very small). Since the p-value is smaller than 0.05
, the null hypothesis can be rejected.
The next test is the Difference in Standard Deviations, which resulted in a p-value of 0.99
. Since this p-value way greater than 0.05
, the null hypothesis cannot be rejected. However, since the private dataset (564) has quite a few more values than the public dataset (211), the private standard deviation will be much larger and thus skewing the results.
After that test is the Chi-Squares test, which resulted in a p-value of 0.002
and is much lower than 0.05
(as a result we can reject the null hypothesis).
The final test was to see the False Negative Rate, which resulted in a value of 0.0
(odds are its not exactly zero but very small). Which is a good thing because that means that the power of this test has high statistical significance.
I first started with a Least Squares fit to see what the type of relationship of my variables have, specifically if they are linearly related or not.
inter, slope = LeastSquares(collegeData['Grad.Rate'], collegeData['Enroll'])
fit_xs, fit_ys = FitLine(collegeData['Grad.Rate'], inter, slope)
fig = plt.figure()
fig.set_size_inches(16, 10)
plt.rcParams['axes.titlesize'] = 16
thinkplot.set_font_size(5)
thinkplot.Scatter(collegeData['Grad.Rate'], collegeData['Enroll'], color='blue', alpha=0.6, s=10)
thinkplot.Plot(fit_xs, fit_ys, color='white', linewidth=3)
thinkplot.Plot(fit_xs, fit_ys, color='red', linewidth=2)
thinkplot.Config(title='Least Squares Best Fit - Graduation Rates vs. New Student Enrollments',
xlabel="Graduation Rate (%)",
ylabel='New Student Enrollments',
legend=False)
This is to see if the relationship between Grad.Rate and Enroll is linear by plotting a CDF of the percentiles.
collegeData['residual'] = Residuals(collegeData['Grad.Rate'], collegeData['Enroll'], inter, slope)
collegeData = collegeData.rename(columns = {'Grad.Rate': 'Grad_Rate'})
bins = np.arange(10, 100, 8)
indices = np.digitize(collegeData['Grad_Rate'], bins)
groups = collegeData.groupby(indices)
gradRate_means = [group.Grad_Rate.mean() for _, group in groups][1:-1]
cdfs = [thinkstats2.Cdf(group.residual) for _, group in groups][1:-1]
fig = plt.figure()
fig.set_size_inches(16, 10)
plt.rcParams['axes.titlesize'] = 16
PlotPercentiles(gradRate_means, cdfs)
thinkplot.set_font_size(5)
thinkplot.Config(title='CDF Percentile Ranks - Graduation Rate vs. New Student Enrollments',
xlabel="Graduation Rate (%)",
ylabel='Residual', legend=True)
plt.legend(prop={"size":15})
collegeData = collegeData.rename(columns = {'Grad_Rate': 'Grad.Rate'})
The above graph shows the 25th, 50th, and 75th percentiles of the Least Squares residuals.
Since the percentile lines are angled (or otherwise curved) in the residuals, this suggests that there is a non-linear relationship between Graduation Rate and Enrollments.
Next, to see the uncertainty of the estimated slope and intercept, a fitted line can be created for each of the resampled estimates and plot them on top of each other.
inters, slopes = SamplingDistributions(collegeData, iters=1001)
fig = plt.figure()
fig.set_size_inches(16, 10)
plt.rcParams['axes.titlesize'] = 16
PlotConfidenceIntervals(gradRate_means, inters, slopes, percent=90,
color='gray', alpha=0.3, label='90% CI')
PlotConfidenceIntervals(gradRate_means, inters, slopes, percent=60,
color='gray', alpha=0.5, label='60% CI')
PlotConfidenceIntervals(gradRate_means, inters, slopes, percent=30,
color='gray', alpha=0.5, label='30% CI')
thinkplot.set_font_size(5)
thinkplot.Config(title='Confidence Intervals for Fitted Values of Graduation Rate',
xlabel="Graduation Rate (%)",ylabel='Residual')
plt.legend(prop={"size":15})
The above graph shows the confidence interval for the fitted values at each Graduation Rate.
Checking the r squared (r2) value of the variance of the residuals to the variance of my dependent variable (Grad.Rate)
inter, slope = LeastSquares(collegeData['Grad.Rate'], collegeData['Enroll'])
res = Residuals(collegeData['Grad.Rate'], collegeData['Enroll'], inter, slope)
r2 = CoefDetermination(collegeData['Enroll'], res)
print(round(r2, 8))
From the r squared value above, I can deduce that the number of student enrollments into a university predicts a very, very small part of the variance in the graduation rate.
This next test is to see whether the observed slope is statistically significant.
ht = SlopeTest((collegeData['Grad.Rate'], collegeData['Enroll']))
slopeP_Value = ht.PValue()
sampling_cdf = thinkstats2.Cdf(slopes)
fig = plt.figure()
fig.set_size_inches(16, 10)
plt.rcParams['axes.titlesize'] = 16
thinkplot.PrePlot(2)
thinkplot.Plot([0, 0], [0, 1], color='0.8')
ht.PlotCdf(label='Null Hypothesis')
thinkplot.set_font_size(5)
thinkplot.Cdf(sampling_cdf, label='Sampling Distribution')
thinkplot.Config(title='Slope Hypothesis Test - Grad Rate and New Student Enrollments',
xlabel='Slope (Grad Rate / Enroll)',
ylabel='CDF',
xlim=[-15, 15],
legend=True, loc='upper left')
plt.xticks(np.arange(min([-14]), max([14]) + 1, 2.0))
plt.legend(prop={"size":15})
pvalue = sampling_cdf[0]
print('P-Value from sampling CDF: ' + str(round(pvalue, 5)))
From the above graph, we can see that the slopes have pretty much the same shape with the only difference being their means. Null Hypothesis is at 0 and the observed slope is at around -1.
The P-value of the sampled slope (0.775) indicates that this is not at all statistically significant, which further lends credence to the Enroll variable not having a strong relationship the graduation rates of the universities.
Next, I did some multiple regression of my variables using Grad.Rate as the dependent variable.
formula = 'Q("Grad.Rate") ~ Private + Apps + Accept + Enroll + Q("Room.Board") + Books + Personal + PhD + Expend'
model = smf.ols(formula, data=collegeData)
results = model.fit()
results.summary()
The above output is a model using of all my variables (from the entirety of my dataset) with Grad.Rate as the dependent variable. In order to make a better model I need to first take a look at which of my variables are statistically significant for trying to explain the dependent variable (Grad.Rate).
To find which variables are significant, I am looking for p-values of 0.05 or less and I can find those values under the P>|t| column. From there I can see that the variables Enroll (0.828), Books (0.429), and Expend (0.261) all have p-values larger than 0.05 (thus not statistically significant to this model) and should be removed.
Rerun the regression model with all variables that have p-value greater than 0.05 removed (Enroll, Books, and Expend).
formula = 'Q("Grad.Rate") ~ Private + Apps + Accept + Q("Room.Board") + Personal + PhD'
model = smf.ols(formula, data=collegeData)
results = model.fit()
results.summary()
Looking under the P>|t| column again in the above output, I can see that there are no longer any variables in this model that have a p-value greater than 0.05 and thus they are statistically significant to explaining Grad.Rate.
Question: Does a University being Private or Public have an effect on its graduation rate and what factors specifically affect those rates?
Hypothesis: Private universities might have a higher graduation rate than public universities and the biggest factors for the graduation rate will be: the number of students enrolled, the amount of faculty with PhD’s and economic factors of the university (Variables: Room.Board, Books, Personal and Expend).
From my restated question and hypothesis above, my conclusion based on this data and my analysis is that public universities actually have a higher graduation rate than private universities.
Furthermore, the number of new student enrollments have a statistically insignificant effect on the grad rate and the variables that have significant effects are: the number of new student applications (Apps), number of new students accepted (Accept), cost of room and board (Room.Board), the amount of personal expenses for the student (Personal), and the percentage of faculty with PhD's (PhD).
This also means that my hypothesis was only partially correct about the economic factors being the main variables that effect the grad rate, with the cost of books (Books) and expenditure (Expend) not having a significant effect.